#importing libraries
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Read the insurance dataset from the working directory and preview its first rows.
data = pd.read_csv("insurance.csv")
data.head()
| age | sex | bmi | children | smoker | region | expenses | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.9 | 0 | yes | southwest | 16884.92 |
| 1 | 18 | male | 33.8 | 1 | no | southeast | 1725.55 |
| 2 | 28 | male | 33.0 | 3 | no | southeast | 4449.46 |
| 3 | 33 | male | 22.7 | 0 | no | northwest | 21984.47 |
| 4 | 32 | male | 28.9 | 0 | no | northwest | 3866.86 |
data.shape #dimensions of the dataset as (rows, columns)
(1338, 7)
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 expenses 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
# Summary statistics for the numeric columns, one row per column.
data.describe(include=np.number).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 1338.0 | 39.207025 | 14.049960 | 18.00 | 27.0000 | 39.00 | 51.000 | 64.00 |
| bmi | 1338.0 | 30.665471 | 6.098382 | 16.00 | 26.3000 | 30.40 | 34.700 | 53.10 |
| children | 1338.0 | 1.094918 | 1.205493 | 0.00 | 0.0000 | 1.00 | 2.000 | 5.00 |
| expenses | 1338.0 | 13270.422414 | 12110.011240 | 1121.87 | 4740.2875 | 9382.03 | 16639.915 | 63770.43 |
# Frequency of insurance records for each age.
age_values = data['age']
fig = px.histogram(age_values)
fig.show()
# Expenses by age: total, maximum, minimum and mean expenses per age value.
#
# The 'expenses' column is selected *before* aggregating. Aggregating the
# whole frame first would also try to aggregate the text columns
# (sex, smoker, region): mean() raises TypeError on them in pandas >= 2.0,
# and sum() needlessly concatenates their strings.
expenses_by_age = data.groupby('age')['expenses']
age_group = expenses_by_age.sum()
age_group_max = expenses_by_age.max()
age_group_min = expenses_by_age.min()
age_group_mean = expenses_by_age.mean()

# One line per statistic, all sharing the age axis.
fig = go.Figure()
for series, label in (
    (age_group, 'Total expenses'),
    (age_group_max, 'Max expenses'),
    (age_group_min, 'Min expenses'),
    (age_group_mean, 'Mean expenses'),
):
    fig.add_trace(go.Scatter(x=series.index, y=series.values,
                             mode='lines+markers', name=label))
fig.update_layout(title='Expenses due to different age',
                  xaxis_title='Age',
                  yaxis_title='Expenses')
fig.show()
# Mean expenses for each distinct BMI value, drawn as a filled area chart.
# 'expenses' is selected before aggregating so that mean() is never applied
# to the text columns (which raises TypeError in pandas >= 2.0).
bmi_group = data.groupby('bmi')['expenses'].mean()
fig = go.Figure()
fig.add_trace(go.Scatter(x=bmi_group.index, y=bmi_group.values, stackgroup='one'))
fig.update_layout(title='Expenses for various bmi index',
                  xaxis_title='bmi',
                  yaxis_title='Expenses')
fig.show()
# Total expenses for each number of children.
# Only the 'expenses' column is summed; summing the whole frame would also
# concatenate the text columns for every group before they are discarded.
child_group = data.groupby('children')['expenses'].sum()
fig = px.bar(child_group)
fig.update_layout(title='Expenses for different childeren no', yaxis_title='Expenses')
fig.show()
# Share of total expenses attributable to smokers vs. non-smokers.
# Only the 'expenses' column is summed; summing the whole frame would also
# concatenate the other text columns for every group before they are discarded.
smoker_group = data.groupby('smoker')['expenses'].sum()
fig = px.pie(values=smoker_group.values, names=smoker_group.index,
             title='pertcentage of smoker and non-smoker expenses')
fig.show()  # slice labels: "yes" = smoker, "no" = non-smoker
# Total expenses per region, drawn as a donut chart.
# Only the 'expenses' column is summed; summing the whole frame would also
# concatenate the other text columns for every group before they are discarded.
region_group = data.groupby('region')['expenses'].sum()
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']
fig = go.Figure(data=[go.Pie(labels=region_group.index,
                             values=region_group.values,
                             hole=.5)])
# Show the region name on each slice; value and percentage appear on hover.
fig.update_traces(hoverinfo='value+percent', textinfo='label', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()
# Distribution of expenses: normalised histogram with a density estimate on top.
# The bins must span the data range (expenses run from roughly 1e3 to 6e4).
# An integer bin count lets the edges be derived from the data; fixed edges
# that miss every value leave the histogram empty, and normalising an empty
# histogram yields "invalid value encountered in true_divide".
ax = data['expenses'].plot.hist(density=True, bins=30)
data["expenses"].plot.density(ax=ax)
C:\Users\SVMY\anaconda3\lib\site-packages\numpy\lib\histograms.py:905: RuntimeWarning: invalid value encountered in true_divide
<AxesSubplot:ylabel='Density'>
# Correlation heatmap of the numeric columns.
# The text columns (sex, smoker, region) are excluded explicitly: corr() on a
# frame containing them raises in pandas >= 2.0, and select_dtypes works on
# every pandas version.
ax = sns.heatmap(data.select_dtypes(include=np.number).corr(), annot=True)
# Violin plot of expenses coloured by sex, with an inner box plot and every
# data point drawn.
violin_options = dict(
    y="expenses",
    color="sex",
    points='all',
    box=True,
    violinmode='overlay',  # draw the violins on top of each other (default is 'group')
    hover_data=data.columns,
)
fig = px.violin(data, **violin_options)
fig.update_layout(yaxis_zeroline=False)
fig.show()
# Trim extreme expenses: keep only rows strictly between the 1st and 99th
# percentiles of 'expenses'.
expenses = data["expenses"]
q_low = expenses.quantile(0.01)
q_hi = expenses.quantile(0.99)
keep_rows = (expenses > q_low) & (expenses < q_hi)
data = data[keep_rows]
data.shape
(1310, 7)
# One-hot encode the categorical columns; numeric columns pass through unchanged.
categorical_columns = ['sex', 'smoker', 'region']
df = pd.get_dummies(data, columns=categorical_columns)
df
| age | bmi | children | expenses | sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | 27.9 | 0 | 16884.92 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 18 | 33.8 | 1 | 1725.55 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 28 | 33.0 | 3 | 4449.46 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 33 | 22.7 | 0 | 21984.47 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 4 | 32 | 28.9 | 0 | 3866.86 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | 31.0 | 3 | 10600.55 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1334 | 18 | 31.9 | 0 | 2205.98 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1335 | 18 | 36.9 | 0 | 1629.83 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 1336 | 21 | 25.8 | 0 | 2007.95 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1337 | 61 | 29.1 | 0 | 29141.36 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1310 rows × 12 columns
# Target is the 'expenses' column; the features are every remaining column.
y = df['expenses']
x = df.drop(columns="expenses")
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% test rows.
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
from sklearn import linear_model
# Ordinary least-squares linear regression, fitted on the training split.
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
LinearRegression()
from sklearn.metrics import confusion_matrix  # not used in this cell; kept in case other cells rely on the name
from sklearn.metrics import r2_score

# Predict expenses for the held-out rows.
y_pred = regr.predict(x_test)

# R^2 score expressed as a whole percentage.
# Scale to percent *before* rounding: round(score, 2) * 100 can land just
# below an integer (0.57 * 100 == 56.99999999999999), which int() would then
# truncate to the wrong value.
# NOTE: the value printed is the coefficient of determination (R^2), not a
# classification accuracy; the message text is left unchanged.
r = round(r2_score(y_test, y_pred) * 100)
print("accuracy of this model is :", int(r), "%")
accuracy of this model is : 75 %
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the test-set predictions, in the same units as
# 'expenses'.
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
rms
13234.176301857424